HW 03

Author

Weston Scott

1 - Du Bois challenge.

income <- read.csv("data/income.csv")

income <- income |>
    mutate(
        Average_Income = as.integer(Average_Income),
        ClassLabel = factor(
            paste0(Class, " | $", 
                   format(Average_Income, 
                          big.mark = ",")
                  ),
            levels = unique(paste0(Class, " | $", 
                               format(Average_Income, 
                                      big.mark = ",")))
            )
    ) |>

    pivot_longer(
        cols = Rent:Other, ## list slice-like syntax to get the ordered columns
        names_to = "Category", 
        values_to = "Percent"
    ) |>

    mutate(
        Category = factor(Category, 
                          levels = c("Other", 
                                     "Tax", 
                                     "Clothes", 
                                     "Food", 
                                     "Rent")),
        text_color = ifelse(Category == "Rent", 
                            "white", 
                            "black")
    ) |>

    group_by(ClassLabel) |>
    mutate(pos = cumsum(Percent) - Percent / 2) |>
    ungroup() |> glimpse()
Rows: 35
Columns: 7
$ Class          <chr> "$100-200", "$100-200", "$100-200", "$10…
$ Average_Income <int> 139, 139, 139, 139, 139, 249, 249, 249, …
$ ClassLabel     <fct> "$100-200 | $  139", "$100-200 | $  139"…
$ Category       <fct> Rent, Food, Clothes, Tax, Other, Rent, F…
$ Percent        <dbl> 19.0, 43.0, 28.0, 9.9, 0.1, 22.0, 47.0, …
$ text_color     <chr> "white", "black", "black", "black", "bla…
$ pos            <dbl> 9.50, 40.50, 76.00, 94.95, 99.95, 11.00,…
category_colors <- c(
    Rent = "black",
    Food = "slateblue4",
    Clothes = "rosybrown2",
    Tax = "gray60",
    Other = "tan"
)
du_bois <- ggplot(
    income, 
    aes(x = fct_rev(ClassLabel), 
        y = Percent, 
        fill = Category)
    ) +

    geom_col(color = "black", 
             width = 0.7) +

    geom_text(data = filter(income, 
                            Percent > 1),
              aes(label = paste0(round(Percent, 1), "%"), 
                  y = pos, 
                  color = text_color), 
              size = 2.5
    ) +
    scale_fill_manual(values = category_colors) +
    scale_color_manual(values = c("white" = "white", 
                                  "black" = "black")) +

    coord_flip() +
    scale_y_continuous(labels = NULL) +
  
    annotate("text", 
             x = c(1, 2.5, 4.5, 6.5), 
             y = 102, 
             label = c("Well-To-Do", 
                       "Comfortable", 
                       "Fair", 
                       "Poor"), 
             size = 2.5, 
             angle = 90) +
  
  labs(
      x = NULL, 
      y = NULL, 
      title = "Annual Expenditure For Provided Data",
      text_color = NULL
  ) +

  theme(
      axis.title = element_blank(),
      axis.text.y = element_text(face = "bold", 
                                 size = 8),
      panel.grid = element_blank(),
      legend.title = element_blank(),
      legend.position = "top",
      plot.title = element_text(hjust = 0.5, margin = margin(b = 10))
  ) +

  guides(fill = guide_legend(reverse = TRUE), 
         color = "none")

ggbackground(du_bois, "images/paper.jpg")

2 - COVID survey - interpret

The first relationship I noticed was the question “Had flu vaccine this year”. I am understanding that the error bar lengths for the “No” response seem much longer than those of the “Yes” response for all top level COVID questions. It would be my assumption that those individuals that do not obtain the flu shot likely follow more information on the subject matter of whether or not the COVID vaccine is safe versus not safe. Those that get the flu shot appear to have responses overall that are more centrally located to the means, telling me that they are either not informed or maybe are simply not as concerned with the situation as compared to those that did not get the flu shot.

Example 2

Looking at the “I trust information that I have received about the vaccines” column has a very small deviation from a localized mean across the board. Every combination appears to have show low confidence, leading to small error bars, with more survey results in the lower values for that question.

Example 3

Something interesting that I think is quite visible with the provided image is that for the entire set of data, there are 2 question columns that tend to have the most diverse results, meaning the widest spread of answers, or the 10th and 90th percentile bars are on average the longest. The questions are “Based on my understanding, I believe the vaccine is safe” and “I am concerned about the safety and side effects of the vaccine.” I would say that these results would directly reflect information (or misinformation) dispersed to the masses. The length of the bars suggest that there are more people on either end of the spectrum for the questions then there are neutral responses.

Example 4

A final observation I made looking at this dataset involved the age demographic against the “Based on my understanding, I believe the vaccine is safe”. There is are large error bars for each age group except the youngest group. I attribute that to simply youth not being as informed relative to the information that is being dispersed. The spread of the survey results for the youth is minimal.

3 - COVID survey - reconstruct

covid_survey <- read.csv("data/covid-survey.csv")

covid_survey <- covid_survey |> 
    row_to_names(row_number = 1) |>
    clean_names() |>
    mutate(
        across(everything(), 
               ~ na_if(., ""))
    ) |>

    filter(
        if_any(-response_id, 
               ~ !is.na(.))
    ) |> glimpse()
Rows: 1,111
Columns: 14
$ response_id             <chr> "1", "2", "4", "5", "6", "7", "…
$ exp_profession          <chr> "1", "1", "1", "1", "1", "1", "…
$ exp_flu_vax             <chr> "1", "1", "1", "1", "1", "1", "…
$ exp_gender              <chr> "0", "1", "0", "0", "1", "1", "…
$ exp_race                <chr> "2", "2", "5", "5", "5", "5", "…
$ exp_ethnicity           <chr> "2", "2", "2", "2", "2", "2", "…
$ exp_age_bin             <chr> "25", "20", "25", "25", "25", "…
$ exp_already_vax         <chr> "1", "1", "1", "1", "1", "1", "…
$ resp_safety             <chr> "5", "5", "5", "5", "5", "5", "…
$ resp_confidence_science <chr> "2", "1", "1", "1", "1", "1", "…
$ resp_concern_safety     <chr> "2", "1", "1", "1", "1", "1", "…
$ resp_feel_safe_at_work  <chr> "1", "1", "1", "1", "1", "1", "…
$ resp_will_recommend     <chr> "1", "1", "1", "1", "1", "1", "…
$ resp_trust_info         <chr> "1", "1", "1", "1", "1", "2", "…
covid_survey <- covid_survey |>
    mutate(
        response_id = recode(response_id,
                             "1" ="Yes", 
                             "0" = "No"),
        exp_profession = recode(exp_profession,
                                "1" = "Nursing", 
                                "0" = "Medical"),
        exp_gender = recode(exp_gender,
                            "0" = "Male",
                            "1" = "Female",
                            "3" = "Non-binary/Third gender",
                            "4" = "Prefer not to say"),
        exp_race = recode(exp_race,
                          "1" = "American Indian/Alaskan Native",
                          "2" = "Asian",
                          "3" = "Black/African American",
                          "4" = "Native Hawaiian/Other Pacific Islander",
                          "5" = "White"),
        exp_ethnicity = recode(exp_ethnicity,
                               "1" = "Hispanic/Latino",
                               "2" = "Non-Hispanic/Non-Latino"),
        exp_age_bin = recode(exp_age_bin,
                             "0" = "<20",
                             "20" = "21-25",
                             "25" = "25-30",
                             "30" = ">30")
    
    ) |> glimpse()
Rows: 1,111
Columns: 14
$ response_id             <chr> "Yes", "2", "4", "5", "6", "7",…
$ exp_profession          <chr> "Nursing", "Nursing", "Nursing"…
$ exp_flu_vax             <chr> "1", "1", "1", "1", "1", "1", "…
$ exp_gender              <chr> "Male", "Female", "Male", "Male…
$ exp_race                <chr> "Asian", "Asian", "White", "Whi…
$ exp_ethnicity           <chr> "Non-Hispanic/Non-Latino", "Non…
$ exp_age_bin             <chr> "25-30", "21-25", "25-30", "25-…
$ exp_already_vax         <chr> "1", "1", "1", "1", "1", "1", "…
$ resp_safety             <chr> "5", "5", "5", "5", "5", "5", "…
$ resp_confidence_science <chr> "2", "1", "1", "1", "1", "1", "…
$ resp_concern_safety     <chr> "2", "1", "1", "1", "1", "1", "…
$ resp_feel_safe_at_work  <chr> "1", "1", "1", "1", "1", "1", "…
$ resp_will_recommend     <chr> "1", "1", "1", "1", "1", "1", "…
$ resp_trust_info         <chr> "1", "1", "1", "1", "1", "2", "…
covid_survey_longer <- covid_survey |>
    pivot_longer(
        cols = starts_with("exp_"),
        names_to = "explanatory",
        values_to = "explanatory_value"
    ) |>
    filter(!is.na(explanatory_value)) |>
    pivot_longer(
        cols = starts_with("resp_"),
        names_to = "response",
        values_to = "response_value"
    ) |> glimpse()
Rows: 43,428
Columns: 5
$ response_id       <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Y…
$ explanatory       <chr> "exp_profession", "exp_profession", "…
$ explanatory_value <chr> "Nursing", "Nursing", "Nursing", "Nur…
$ response          <chr> "resp_safety", "resp_confidence_scien…
$ response_value    <chr> "5", "2", "2", "1", "1", "1", "5", "2…

4 - COVID survey - re-reconstruct

5 - COVID survey - another view